knitr::opts_chunk$set(fig.align="center")
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr)
library(ggplot2)
library(magrittr)
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)
theme_set(theme_light())
In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then measured the user’s accuracy on two tasks: Find Extremum and Retrieve Value.
Given a search algorithm (bfs or dfs), an oracle (compassql or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s chance of answering the Find Extremum and Retrieve Value tasks correctly. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s accuracy for these two tasks.
# Load the per-task accuracy observations.
accuracy_data <- read.csv('processed_accuracy_split.csv')

# The three experimental conditions are categorical predictors.
condition_columns <- c("oracle", "search", "dataset")
accuracy_data[condition_columns] <- lapply(
  accuracy_data[condition_columns],
  as.factor
)

# Containers filled in per task further down the analysis.
models <- list()
draw_data <- list()
search_differences <- list()
oracle_differences <- list()

# Seed shared by model fitting and posterior draws.
seed <- 12
# Keep only the observations for the Find Extremum task.
data_find_extremum <- subset(accuracy_data, task == "1. Find Extremum")

# Bernoulli (logit link) regression of accuracy on the full
# oracle x search x dataset interaction.
models$find_extremum <- brm(
  formula = accuracy ~ oracle * search * dataset,
  data = data_find_extremum,
  family = bernoulli(link = "logit"),
  prior = c(prior(normal(1, .05), class = Intercept)),
  chains = 2,
  cores = 2,
  iter = 3000,
  warmup = 500,
  seed = seed,
  # With `file` set, brms saves the fit to disk and reloads it on later
  # runs instead of refitting; delete the saved file after changing the
  # model specification.
  file = "acc_find_extremum"
)
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/Rcpp/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/unsupported" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/src/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppParallel/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fPIC -Wall -g -O2 -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:16: error: expected ';' after top level declarator
## namespace Eigen {
## ^
## ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
## ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1
## Start sampling
In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.
# Print the fitted Find Extremum model's summary (estimates, Rhat, ESS).
summary(models$find_extremum)
## Family: bernoulli
## Links: mu = logit
## Formula: accuracy ~ oracle * search * dataset
## Data: data_find_extremum (Number of observations: 59)
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
## total post-warmup samples = 5000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat
## Intercept 0.66 0.80 -0.83 2.28 1.00
## oracledziban 0.89 1.29 -1.54 3.54 1.00
## searchdfs 0.06 1.20 -2.26 2.54 1.00
## datasetmovies 0.04 1.17 -2.28 2.32 1.00
## oracledziban:searchdfs -0.92 1.73 -4.42 2.46 1.00
## oracledziban:datasetmovies -0.04 1.79 -3.53 3.43 1.00
## searchdfs:datasetmovies 0.75 1.72 -2.63 4.06 1.00
## oracledziban:searchdfs:datasetmovies -0.65 2.44 -5.36 4.18 1.00
## Bulk_ESS Tail_ESS
## Intercept 2280 2229
## oracledziban 1994 2397
## searchdfs 2042 2431
## datasetmovies 2083 2590
## oracledziban:searchdfs 1776 2302
## oracledziban:datasetmovies 1783 2837
## searchdfs:datasetmovies 1736 2485
## oracledziban:searchdfs:datasetmovies 1626 2654
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for the model.
# Draw the default diagnostic plot for the fitted Find Extremum model.
plot(models$find_extremum)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating the effects of those parameters).
# Pairs plot of the Find Extremum model's parameters.
pairs(models$find_extremum)
A confusion matrix can be used to check our correct classification rate (a useful measure to see how well our model fits our data).
# First column of the prediction summary for each observed row.
pred <- predict(models$find_extremum, type = "response")[, 1]
# Classify as correct (1) when that value exceeds 0.5, otherwise 0.
pred <- as.numeric(pred > 0.5)

# Cross-tabulate predicted class (rows) against observed accuracy (columns).
confusion_matrix <- table(pred, data_find_extremum$accuracy)
confusion_matrix
##
## pred 0 1
## 1 5 54
Visualization of parameter effects via draws from our model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Draws of the fitted success probability for every observed row.
draw_data$find_extremum <- data_find_extremum %>%
  add_fitted_draws(models$find_extremum, seed = seed, re_formula = NA) %>%
  group_by(search, oracle, dataset, .draw)
draw_data$find_extremum$task <- "1. Find Extremum"
# Label each row with its oracle/search combination, e.g. "dziban_bfs".
draw_data$find_extremum$condition <- paste(
  draw_data$find_extremum$oracle,
  draw_data$find_extremum$search,
  sep = "_"
)

# Half-eye plot of predicted accuracy per condition, filled by dataset.
# alpha is a fixed layer parameter rather than an aes() mapping: mapping a
# constant inside aes() sends it through the alpha scale, which adds a
# meaningless legend entry and does not yield a literal 0.5 opacity.
find_extremum_plot <- draw_data$find_extremum %>%
  ggplot(aes(x = .value, y = condition, fill = dataset)) +
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  labs(x = "Predicted Accuracy (p_correct)", y = "Oracle/Search Combination")
find_extremum_plot
Since the credible intervals on our plot overlap, we can use mean_qi to get the numeric boundaries for the different intervals.
# Mean and 95% / 50% quantile intervals of the fitted accuracy for each
# search x oracle x dataset cell.
fit_info <- mean_qi(
  group_by(draw_data$find_extremum, search, oracle, dataset),
  .value,
  .width = c(.95, .5)
)
fit_info
## # A tibble: 16 x 9
## # Groups: search, oracle [4]
## search oracle dataset .value .lower .upper .width .point .interval
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql birdstrikes 0.640 0.303 0.907 0.95 mean qi
## 2 bfs compassql movies 0.648 0.315 0.913 0.95 mean qi
## 3 bfs dziban birdstrikes 0.790 0.479 0.972 0.95 mean qi
## 4 bfs dziban movies 0.793 0.491 0.971 0.95 mean qi
## 5 dfs compassql birdstrikes 0.651 0.309 0.919 0.95 mean qi
## 6 dfs compassql movies 0.787 0.487 0.969 0.95 mean qi
## 7 dfs dziban birdstrikes 0.646 0.314 0.913 0.95 mean qi
## 8 dfs dziban movies 0.669 0.363 0.912 0.95 mean qi
## 9 bfs compassql birdstrikes 0.640 0.531 0.760 0.5 mean qi
## 10 bfs compassql movies 0.648 0.537 0.767 0.5 mean qi
## 11 bfs dziban birdstrikes 0.790 0.710 0.894 0.5 mean qi
## 12 bfs dziban movies 0.793 0.717 0.889 0.5 mean qi
## 13 dfs compassql birdstrikes 0.651 0.541 0.772 0.5 mean qi
## 14 dfs compassql movies 0.787 0.710 0.885 0.5 mean qi
## 15 dfs dziban birdstrikes 0.646 0.537 0.768 0.5 mean qi
## 16 dfs dziban movies 0.669 0.572 0.776 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in accuracy between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
Differences in search algorithms:
# Posterior predictive draws for every observed row.
find_extremum_predictive_data <- data_find_extremum %>%
  add_predicted_draws(
    models$find_extremum,
    seed = seed,
    re_formula = NA
  ) %>%
  group_by(search, oracle, dataset, .draw)

# Per draw: average predicted accuracy for each search algorithm within a
# dataset, then the difference between the search levels.
search_differences$find_extremum <- find_extremum_predictive_data %>%
  group_by(search, dataset, .draw) %>%
  # No weights are supplied, so this is the plain mean of the predictions.
  summarise(accuracy = weighted.mean(.prediction)) %>%
  compare_levels(accuracy, by = search) %>%
  rename(difference_in_accuracy = accuracy)
## `summarise()` regrouping output by 'search', 'dataset' (override with `.groups` argument)
# Tag the rows with the task so the plot has a y-axis category.
search_differences$find_extremum$metric <- "1. Find Extremum"

# Half-eye plot of the difference in accuracy between search algorithms,
# one facet per dataset; the dashed line marks "no difference".
# - alpha is a fixed layer parameter, not an aes() mapping: mapping a constant
#   goes through the alpha scale and adds a meaningless legend entry.
# - the comparison label is taken as a scalar string from the `search`
#   column instead of pasting a one-cell tibble.
search_differences$find_extremum %>%
  ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
  xlab(paste0(
    "Expected Difference in Accuracy (",
    search_differences$find_extremum[["search"]][1],
    ")"
  )) +
  ylab("Task") +
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal() +
  facet_grid(. ~ dataset)
We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.
# Mean and 95% / 50% quantile intervals of the search-algorithm difference.
mean_qi(
  search_differences$find_extremum,
  difference_in_accuracy,
  .width = c(.95, .5)
)
## # A tibble: 4 x 8
## # Groups: search [1]
## search dataset difference_in_accur… .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs - bfs birdstri… -0.0648 -0.5 0.357 0.95 mean qi
## 2 dfs - bfs movies 0.00237 -0.429 0.408 0.95 mean qi
## 3 dfs - bfs birdstri… -0.0648 -0.214 0.0714 0.5 mean qi
## 4 dfs - bfs movies 0.00237 -0.121 0.146 0.5 mean qi
Differences in oracle:
# Per draw: average predicted accuracy for each oracle within a dataset,
# then the difference between the oracle levels.
oracle_differences$find_extremum <- find_extremum_predictive_data %>%
  group_by(oracle, dataset, .draw) %>%
  # No weights are supplied, so this is the plain mean of the predictions.
  summarise(accuracy = weighted.mean(.prediction)) %>%
  compare_levels(accuracy, by = oracle) %>%
  rename(difference_in_accuracy = accuracy)
## `summarise()` regrouping output by 'oracle', 'dataset' (override with `.groups` argument)
# Tag the rows with the task so the plot has a y-axis category.
oracle_differences$find_extremum$metric <- "1. Find Extremum"

# Half-eye plot of the difference in accuracy between oracles, one facet per
# dataset; the dashed line marks "no difference".
# - alpha is a fixed layer parameter, not an aes() mapping: mapping a constant
#   goes through the alpha scale and adds a meaningless legend entry.
# - the comparison label is taken as a scalar string from the `oracle`
#   column instead of pasting a one-cell tibble.
oracle_differences$find_extremum %>%
  ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
  xlab(paste0(
    "Expected Difference in Accuracy (",
    oracle_differences$find_extremum[["oracle"]][1],
    ")"
  )) +
  ylab("Task") +
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal() +
  facet_grid(. ~ dataset)
We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.
# Mean and 95% / 50% quantile intervals of the oracle difference.
mean_qi(
  oracle_differences$find_extremum,
  difference_in_accuracy,
  .width = c(.95, .5)
)
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle dataset difference_in_acc… .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - c… birdstr… 0.0738 -0.357 0.5 0.95 mean qi
## 2 dziban - c… movies 0.00782 -0.429 0.412 0.95 mean qi
## 3 dziban - c… birdstr… 0.0738 -0.0714 0.214 0.5 mean qi
## 4 dziban - c… movies 0.00782 -0.121 0.146 0.5 mean qi
# Keep only the observations for the Retrieve Value task.
data_retrieve_value <- subset(accuracy_data, task == "2. Retrieve Value")

# Bernoulli (logit link) regression of accuracy on the full
# oracle x search x dataset interaction.
models$retrieve_value <- brm(
  formula = accuracy ~ oracle * search * dataset,
  data = data_retrieve_value,
  family = bernoulli(link = "logit"),
  prior = c(prior(normal(1, .05), class = Intercept)),
  chains = 2,
  cores = 2,
  iter = 3000,
  warmup = 500,
  seed = seed,
  # With `file` set, brms saves the fit to disk and reloads it on later
  # runs instead of refitting; delete the saved file after changing the
  # model specification.
  file = "acc_retrieve_value"
)
## Compiling Stan program...
## Start sampling
In the summary table, we want to see Rhat values close to 1.0 and Bulk_ESS in the thousands.
# Print the fitted Retrieve Value model's summary (estimates, Rhat, ESS).
summary(models$retrieve_value)
## Family: bernoulli
## Links: mu = logit
## Formula: accuracy ~ oracle * search * dataset
## Data: data_retrieve_value (Number of observations: 59)
## Samples: 2 chains, each with iter = 3000; warmup = 500; thin = 1;
## total post-warmup samples = 5000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat
## Intercept 1.53 0.96 -0.18 3.60 1.00
## oracledziban -0.00 1.46 -2.85 2.93 1.00
## searchdfs -0.00 1.43 -2.85 2.82 1.00
## datasetmovies -1.51 1.32 -4.24 0.94 1.00
## oracledziban:searchdfs -0.88 1.93 -4.66 2.83 1.00
## oracledziban:datasetmovies 0.75 1.86 -2.95 4.33 1.00
## searchdfs:datasetmovies 1.48 1.90 -2.11 5.29 1.00
## oracledziban:searchdfs:datasetmovies -0.61 2.57 -5.55 4.31 1.00
## Bulk_ESS Tail_ESS
## Intercept 1887 1918
## oracledziban 1812 1718
## searchdfs 1799 1893
## datasetmovies 1781 1754
## oracledziban:searchdfs 1725 2026
## oracledziban:datasetmovies 1708 1988
## searchdfs:datasetmovies 1686 2280
## oracledziban:searchdfs:datasetmovies 1685 2221
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for the model.
# Draw the default diagnostic plot for the fitted Retrieve Value model.
plot(models$retrieve_value)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters mean that our model has difficulty differentiating the effects of those parameters).
# Pairs plot of the Retrieve Value model's parameters.
pairs(models$retrieve_value)
A confusion matrix can be used to check our correct classification rate (a useful measure to see how well our model fits our data).
# First column of the prediction summary for each observed row.
pred <- predict(models$retrieve_value, type = "response")[, 1]
# Classify as correct (1) when that value exceeds 0.5, otherwise 0.
pred <- as.numeric(pred > 0.5)

# Cross-tabulate predicted class (rows) against observed accuracy (columns).
confusion_matrix <- table(pred, data_retrieve_value$accuracy)
confusion_matrix
##
## pred 0 1
## 0 1 0
## 1 4 54
Visualization of parameter effects via draws from our model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Draws of the fitted success probability for every observed row.
draw_data$retrieve_value <- data_retrieve_value %>%
  add_fitted_draws(models$retrieve_value, seed = seed, re_formula = NA) %>%
  group_by(search, oracle, dataset, .draw)
draw_data$retrieve_value$task <- "2. Retrieve Value"
# Label each row with its oracle/search combination, e.g. "dziban_bfs".
draw_data$retrieve_value$condition <- paste(
  draw_data$retrieve_value$oracle,
  draw_data$retrieve_value$search,
  sep = "_"
)

# Half-eye plot of predicted accuracy per condition, filled by dataset.
# alpha is a fixed layer parameter rather than an aes() mapping: mapping a
# constant inside aes() sends it through the alpha scale, which adds a
# meaningless legend entry and does not yield a literal 0.5 opacity.
retrieve_value_plot <- draw_data$retrieve_value %>%
  ggplot(aes(x = .value, y = condition, fill = dataset)) +
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  labs(x = "Predicted Accuracy (p_correct)", y = "Oracle/Search Combination")
retrieve_value_plot
Since the credible intervals on our plot overlap, we can use mean_qi to get the numeric boundaries for the different intervals.
# Mean and 95% / 50% quantile intervals of the fitted accuracy for each
# search x oracle x dataset cell.
fit_info <- mean_qi(
  group_by(draw_data$retrieve_value, search, oracle, dataset),
  .value,
  .width = c(.95, .5)
)
fit_info
## # A tibble: 16 x 9
## # Groups: search, oracle [4]
## search oracle dataset .value .lower .upper .width .point .interval
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 bfs compassql birdstrikes 0.786 0.454 0.973 0.95 mean qi
## 2 bfs compassql movies 0.504 0.172 0.823 0.95 mean qi
## 3 bfs dziban birdstrikes 0.785 0.460 0.975 0.95 mean qi
## 4 bfs dziban movies 0.663 0.345 0.918 0.95 mean qi
## 5 dfs compassql birdstrikes 0.787 0.471 0.971 0.95 mean qi
## 6 dfs compassql movies 0.786 0.496 0.967 0.95 mean qi
## 7 dfs dziban birdstrikes 0.637 0.305 0.908 0.95 mean qi
## 8 dfs dziban movies 0.661 0.342 0.913 0.95 mean qi
## 9 bfs compassql birdstrikes 0.786 0.706 0.892 0.5 mean qi
## 10 bfs compassql movies 0.504 0.378 0.629 0.5 mean qi
## 11 bfs dziban birdstrikes 0.785 0.702 0.892 0.5 mean qi
## 12 bfs dziban movies 0.663 0.560 0.775 0.5 mean qi
## 13 dfs compassql birdstrikes 0.787 0.706 0.894 0.5 mean qi
## 14 dfs compassql movies 0.786 0.709 0.884 0.5 mean qi
## 15 dfs dziban birdstrikes 0.637 0.522 0.760 0.5 mean qi
## 16 dfs dziban movies 0.661 0.561 0.773 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in accuracy between the two search algorithms (bfs and dfs) and the two oracles (dziban and compassql).
Differences in search algorithms:
# Posterior predictive draws for every observed row.
retrieve_value_predictive_data <- data_retrieve_value %>%
  add_predicted_draws(
    models$retrieve_value,
    seed = seed,
    re_formula = NA
  ) %>%
  group_by(search, oracle, dataset, .draw)

# Per draw: average predicted accuracy for each search algorithm within a
# dataset, then the difference between the search levels.
search_differences$retrieve_value <- retrieve_value_predictive_data %>%
  group_by(search, dataset, .draw) %>%
  # No weights are supplied, so this is the plain mean of the predictions.
  summarise(accuracy = weighted.mean(.prediction)) %>%
  compare_levels(accuracy, by = search) %>%
  rename(difference_in_accuracy = accuracy)
## `summarise()` regrouping output by 'search', 'dataset' (override with `.groups` argument)
# Tag the rows with the task so the plot has a y-axis category.
search_differences$retrieve_value$metric <- "2. Retrieve Value"

# Half-eye plot of the difference in accuracy between search algorithms,
# one facet per dataset; the dashed line marks "no difference".
# - alpha is a fixed layer parameter, not an aes() mapping: mapping a constant
#   goes through the alpha scale and adds a meaningless legend entry.
# - the comparison label is taken as a scalar string from the `search`
#   column instead of pasting a one-cell tibble.
search_differences$retrieve_value %>%
  ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
  xlab(paste0(
    "Expected Difference in Accuracy (",
    search_differences$retrieve_value[["search"]][1],
    ")"
  )) +
  ylab("Task") +
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal() +
  facet_grid(. ~ dataset)
We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.
# Mean and 95% / 50% quantile intervals of the search-algorithm difference.
mean_qi(
  search_differences$retrieve_value,
  difference_in_accuracy,
  .width = c(.95, .5)
)
## # A tibble: 4 x 8
## # Groups: search [1]
## search dataset difference_in_accu… .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs - b… birdstrik… -0.0743 -0.5 0.357 0.95 mean qi
## 2 dfs - b… movies 0.133 -0.304 0.546 0.95 mean qi
## 3 dfs - b… birdstrik… -0.0743 -0.214 0.0714 0.5 mean qi
## 4 dfs - b… movies 0.133 -0.0333 0.279 0.5 mean qi
Differences in oracle:
# Per draw: average predicted accuracy for each oracle within a dataset,
# then the difference between the oracle levels.
oracle_differences$retrieve_value <- retrieve_value_predictive_data %>%
  group_by(oracle, dataset, .draw) %>%
  # No weights are supplied, so this is the plain mean of the predictions.
  summarise(accuracy = weighted.mean(.prediction)) %>%
  compare_levels(accuracy, by = oracle) %>%
  rename(difference_in_accuracy = accuracy)
## `summarise()` regrouping output by 'oracle', 'dataset' (override with `.groups` argument)
# Tag the rows with the task so the plot has a y-axis category.
oracle_differences$retrieve_value$metric <- "2. Retrieve Value"

# Half-eye plot of the difference in accuracy between oracles, one facet per
# dataset; the dashed line marks "no difference".
# - alpha is a fixed layer parameter, not an aes() mapping: mapping a constant
#   goes through the alpha scale and adds a meaningless legend entry.
# - the comparison label is taken as a scalar string from the `oracle`
#   column instead of pasting a one-cell tibble.
oracle_differences$retrieve_value %>%
  ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
  xlab(paste0(
    "Expected Difference in Accuracy (",
    oracle_differences$retrieve_value[["oracle"]][1],
    ")"
  )) +
  ylab("Task") +
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal() +
  facet_grid(. ~ dataset)
We can double-check the boundaries of the credible intervals to be sure whether or not the interval contains zero.
# Mean and 95% / 50% quantile intervals of the oracle difference.
mean_qi(
  oracle_differences$retrieve_value,
  difference_in_accuracy,
  .width = c(.95, .5)
)
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle dataset difference_in_acc… .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban - c… birdstri… -0.0727 -0.5 0.357 0.95 mean qi
## 2 dziban - c… movies 0.00309 -0.429 0.417 0.95 mean qi
## 3 dziban - c… birdstri… -0.0727 -0.214 0.0714 0.5 mean qi
## 4 dziban - c… movies 0.00309 -0.167 0.15 0.5 mean qi
Putting all of the plots for search algorithm differences on the same plot:
# Stack the per-task search-algorithm differences into one table.
combined_search_differences <- rbind(
  search_differences$find_extremum,
  search_differences$retrieve_value
)

# Half-eye plot with one row per task and one facet per dataset; the dashed
# line marks "no difference".
# - alpha is a fixed layer parameter, not an aes() mapping: mapping a constant
#   goes through the alpha scale and adds a meaningless legend entry.
# - the comparison label is taken as a scalar string from the `search`
#   column instead of pasting a one-cell tibble.
search_differences_plot <- combined_search_differences %>%
  ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
  xlab(paste0(
    "Expected Difference in Accuracy (",
    combined_search_differences[["search"]][1],
    ")"
  )) +
  ylab("Task") +
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal() +
  facet_grid(. ~ dataset)
search_differences_plot

# Mean and 95% / 50% quantile intervals per comparison, dataset and task.
search_intervals <- combined_search_differences %>%
  group_by(search, dataset, metric) %>%
  mean_qi(difference_in_accuracy, .width = c(.95, .5))
search_intervals
## # A tibble: 8 x 9
## # Groups: search, dataset [2]
## search dataset metric difference_in_a… .lower .upper .width .point .interval
## <chr> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dfs - … birdst… 1. Fi… -0.0648 -0.5 0.357 0.95 mean qi
## 2 dfs - … birdst… 2. Re… -0.0743 -0.5 0.357 0.95 mean qi
## 3 dfs - … movies 1. Fi… 0.00237 -0.429 0.408 0.95 mean qi
## 4 dfs - … movies 2. Re… 0.133 -0.304 0.546 0.95 mean qi
## 5 dfs - … birdst… 1. Fi… -0.0648 -0.214 0.0714 0.5 mean qi
## 6 dfs - … birdst… 2. Re… -0.0743 -0.214 0.0714 0.5 mean qi
## 7 dfs - … movies 1. Fi… 0.00237 -0.121 0.146 0.5 mean qi
## 8 dfs - … movies 2. Re… 0.133 -0.0333 0.279 0.5 mean qi
Putting all of the plots for oracle differences on the same plot:
# Stack the per-task oracle differences into one table.
combined_oracle_differences <- rbind(
  oracle_differences$find_extremum,
  oracle_differences$retrieve_value
)

# Half-eye plot with one row per task and one facet per dataset; the dashed
# line marks "no difference".
# - alpha is a fixed layer parameter, not an aes() mapping: mapping a constant
#   goes through the alpha scale and adds a meaningless legend entry.
# - the comparison label is taken as a scalar string from the `oracle`
#   column instead of pasting a one-cell tibble.
oracle_differences_plot <- combined_oracle_differences %>%
  ggplot(aes(x = difference_in_accuracy, y = metric, fill = dataset)) +
  xlab(paste0(
    "Expected Difference in Accuracy (",
    combined_oracle_differences[["oracle"]][1],
    ")"
  )) +
  ylab("Task") +
  stat_halfeye(.width = c(.95, .5), alpha = 0.5) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  theme_minimal() +
  facet_grid(. ~ dataset)
oracle_differences_plot

# Mean and 95% / 50% quantile intervals per comparison, dataset and task.
oracle_intervals <- combined_oracle_differences %>%
  group_by(oracle, dataset, metric) %>%
  mean_qi(difference_in_accuracy, .width = c(.95, .5))
oracle_intervals
## # A tibble: 8 x 9
## # Groups: oracle, dataset [2]
## oracle dataset metric difference_in_a… .lower .upper .width .point .interval
## <chr> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 dziban… birdst… 1. Fi… 0.0738 -0.357 0.5 0.95 mean qi
## 2 dziban… birdst… 2. Re… -0.0727 -0.5 0.357 0.95 mean qi
## 3 dziban… movies 1. Fi… 0.00782 -0.429 0.412 0.95 mean qi
## 4 dziban… movies 2. Re… 0.00309 -0.429 0.417 0.95 mean qi
## 5 dziban… birdst… 1. Fi… 0.0738 -0.0714 0.214 0.5 mean qi
## 6 dziban… birdst… 2. Re… -0.0727 -0.214 0.0714 0.5 mean qi
## 7 dziban… movies 1. Fi… 0.00782 -0.121 0.146 0.5 mean qi
## 8 dziban… movies 2. Re… 0.00309 -0.167 0.15 0.5 mean qi